# This script conducts analysis based on the AWAY GAMES ONLY player file.
# Player file created by the script nba_stats_create_player_file_AWAY_GAMES_ONLY.R.
# It prints verbose descriptions of the results for use in constructing the tables and graphs for the paper.

# Read in the player file.
# Run a paired t-test.
# Output summary statistics.
# Output corrected standard errors.

# Read in the player file.
setwd("F:/Experiment Data/Sleep/NBA/EastWestGamefiles")
nba_player_file = read.csv("Total_Game_Stats_11_7_AWAY_GAMES_ONLY.csv", header=TRUE)

# Define a standard error function.
se <- function(x) sqrt(var(x,na.rm=TRUE) / length(na.omit(x)))

# The following helper functions were borrowed from:
# http://www.cookbook-r.com/Graphs/Plotting_means_and_error_bars_(ggplot2)/

## Gives count, mean, standard deviation, standard error of the mean, and confidence interval (default 95%).
##   data: a data frame.
##   measurevar: the name of a column that contains the variable to be summariezed
##   groupvars: a vector containing names of columns that contain grouping variables
##   na.rm: a boolean that indicates whether to ignore NA's
##   conf.interval: the percent range of the confidence interval (default is 95%)
summarySE <- function(data=NULL, measurevar, groupvars=NULL, na.rm=FALSE,
                      conf.interval=.95, .drop=TRUE) {
    library(plyr)

    # New version of length which can handle NA's: if na.rm==T, don't count them
    length2 <- function (x, na.rm=FALSE) {
        if (na.rm) sum(!is.na(x))
        else       length(x)
    }

    # This does the summary. For each group's data frame, return a vector with
    # N, mean, and sd
    datac <- ddply(data, groupvars, .drop=.drop,
      .fun = function(xx, col) {
        c(N    = length2(xx[[col]], na.rm=na.rm),
          mean = mean   (xx[[col]], na.rm=na.rm),
          sd   = sd     (xx[[col]], na.rm=na.rm)
        )
      },
      measurevar
    )

    # Rename the "mean" column    
    datac <- rename(datac, c("mean" = measurevar))

    datac$se <- datac$sd / sqrt(datac$N)  # Calculate standard error of the mean

    # Confidence interval multiplier for standard error
    # Calculate t-statistic for confidence interval: 
    # e.g., if conf.interval is .95, use .975 (above/below), and use df=N-1
    ciMult <- qt(conf.interval/2 + .5, datac$N-1)
    datac$ci <- datac$se * ciMult

    return(datac)
}

## Norms the data within specified groups in a data frame; it normalizes each
## subject (identified by idvar) so that they have the same mean, within each group
## specified by betweenvars.
##   data: a data frame.
##   idvar: the name of a column that identifies each subject (or matched subjects)
##   measurevar: the name of a column that contains the variable to be summariezed
##   betweenvars: a vector containing names of columns that are between-subjects variables
##   na.rm: a boolean that indicates whether to ignore NA's
normDataWithin <- function(data=NULL, idvar, measurevar, betweenvars=NULL,
                           na.rm=FALSE, .drop=TRUE) {
    library(plyr)

    # Measure var on left, idvar + between vars on right of formula.
    data.subjMean <- ddply(data, c(idvar, betweenvars), .drop=.drop,
     .fun = function(xx, col, na.rm) {
        c(subjMean = mean(xx[,col], na.rm=na.rm))
      },
      measurevar,
      na.rm
    )

    # Put the subject means with original data
    data <- merge(data, data.subjMean)

    # Get the normalized data in a new column
    measureNormedVar <- paste(measurevar, "_norm", sep="")
    data[,measureNormedVar] <- data[,measurevar] - data[,"subjMean"] +
                               mean(data[,measurevar], na.rm=na.rm)

    # Remove this subject mean column
    data$subjMean <- NULL

    return(data)
}

## Summarizes data, handling within-subjects variables by removing inter-subject variability.
## It will still work if there are no within-S variables.
## Gives count, un-normed mean, normed mean (with same between-group mean),
##   standard deviation, standard error of the mean, and confidence interval.
## If there are within-subject variables, calculate adjusted values using method from Morey (2008).
##   data: a data frame.
##   measurevar: the name of a column that contains the variable to be summarized
##   betweenvars: a vector containing names of columns that are between-subjects variables
##   withinvars: a vector containing names of columns that are within-subjects variables
##   idvar: the name of a column that identifies each subject (or matched subjects)
##   na.rm: a boolean that indicates whether to ignore NA's
##   conf.interval: the percent range of the confidence interval (default is 95%)
summarySEwithin <- function(data=NULL, measurevar, betweenvars=NULL, withinvars=NULL,
                            idvar=NULL, na.rm=FALSE, conf.interval=.95, .drop=TRUE) {

  # Ensure that the betweenvars and withinvars are factors
  factorvars <- vapply(data[, c(betweenvars, withinvars), drop=FALSE],
    FUN=is.factor, FUN.VALUE=logical(1))

  if (!all(factorvars)) {
    nonfactorvars <- names(factorvars)[!factorvars]
    message("Automatically converting the following non-factors to factors: ",
            paste(nonfactorvars, collapse = ", "))
    data[nonfactorvars] <- lapply(data[nonfactorvars], factor)
  }

  # Get the means from the un-normed data
  datac <- summarySE(data, measurevar, groupvars=c(betweenvars, withinvars),
                     na.rm=na.rm, conf.interval=conf.interval, .drop=.drop)

  # Drop all the unused columns (these will be calculated with normed data)
  datac$sd <- NULL
  datac$se <- NULL
  datac$ci <- NULL

  # Norm each subject's data
  ndata <- normDataWithin(data, idvar, measurevar, betweenvars, na.rm, .drop=.drop)

  # This is the name of the new column
  measurevar_n <- paste(measurevar, "_norm", sep="")

  # Collapse the normed data - now we can treat between and within vars the same
  ndatac <- summarySE(ndata, measurevar_n, groupvars=c(betweenvars, withinvars),
                      na.rm=na.rm, conf.interval=conf.interval, .drop=.drop)

  # Apply correction from Morey (2008) to the standard error and confidence interval
  #  Get the product of the number of conditions of within-S variables
  nWithinGroups    <- prod(vapply(ndatac[,withinvars, drop=FALSE], FUN=nlevels,
                           FUN.VALUE=numeric(1)))
  correctionFactor <- sqrt( nWithinGroups / (nWithinGroups-1) )

  # Apply the correction factor
  ndatac$sd <- ndatac$sd * correctionFactor
  ndatac$se <- ndatac$se * correctionFactor
  ndatac$ci <- ndatac$ci * correctionFactor

  # Combine the un-normed means with the normed results
  merge(datac, ndatac)
}


# Hard-coded variable.
analyzeColumn = "PTS"
variableName = "Points"
variableUnit = "points"
analysisContext = "All Players, Total Games Stats, TIGHT WINDOW"

# Descriptive statistics with standard error adjusted for within subject comparisons.
print(summarySEwithin(data=nba_player_file, measurevar=analyzeColumn, betweenvars=NULL, withinvars="had_late_night", idvar="player_name"))
# Now perform the paired t-test.
cat("Paired t-test results below.  Every player serves as his own 'control.'",  "\n")
cat("We compare the same player's performance in games with previous-night late-night tweets versus games with NO previous-night late-night tweets.",  "\n")
print(t.test(nba_player_file[!nba_player_file$had_late_night, analyzeColumn], nba_player_file[nba_player_file$had_late_night, analyzeColumn], paired=TRUE))

# Minimally:
analyzeColumn = "PF"
print(summarySEwithin(data=nba_player_file, measurevar=analyzeColumn, betweenvars=NULL, withinvars="had_late_night", idvar="player_name"))
print(t.test(nba_player_file[!nba_player_file$had_late_night, analyzeColumn], nba_player_file[nba_player_file$had_late_night, analyzeColumn], paired=TRUE))


# Should you wish to run the function on each column.
#analysisColumns = c("FG", "FGA", "3P", "3PA", "FT", "FTA", "ORB", "DRB", "TRB", "AST", "STL", "BLK", "TOV", "PF", "PTS", "minutes")
#variableNames = c("Field Goals (2 pts) Made", "Field Goals Attempted", "3-Pointers Made", "3-Pointers Attempted", "Free Throws (1 pt) Made", "Free Throws Attempted", "Offensive Rebounds", "Defensive Rebounds", "Total Rebounds", "Assists", "Steals", "Blocks", "Turnovers", "Personal Fouls Committed", "Points Score", "Minutes Played")
#variableUnits = c("field goals made", "field goals attempted", "3-pointers made", "3-Pointers attempted", "free throws made", "free throws attempted", "off rebounds", "def rebounds", "tot rebounds", "assists", "steals", "blocks", "turnovers", "fouls", "points", "minutes")
#
#for(i in 1:length(analysisColumns)) {
#	showColPairedTest(nba_stats, analysisColumns[i], variableNames[i], variableUnits[i], "All Players, Total Games Stats")
#}


# Repeat the above procedure for shooting percentages.
#  Read in the player file.
#  Run a paired t-test.
#  Output summary statistics.
#  Output corrected standard errors.

# Read in the player file.
setwd("F:/Experiment Data/Sleep/NBA/EastWestGamefiles")
shoot_percents = read.csv("Shoot_Percents_11_7_AWAY_GAMES_ONLY.csv", header=TRUE)

# Hard-coded variable.
# fg_percent ft_percent tp_percent
analyzeColumn = "fg_percent"

# Remove rows with NA values.  NA indicates a 0/0 undefined percentage.
removePlayers = unique(shoot_percents[is.na(shoot_percents$fg_percent), c("player_name")])
#dim(shoot_percents)
shoot_percents = shoot_percents[!(shoot_percents$player_name %in% removePlayers), ]
#dim(shoot_percents)

# Descriptive statistics with standard error adjusted for within subject comparisons.
print(summarySEwithin(data=shoot_percents, measurevar=analyzeColumn, betweenvars=NULL, withinvars="had_late_night", idvar="player_name"))
# Now perform the paired t-test.
print(t.test(shoot_percents[!shoot_percents$had_late_night, analyzeColumn], shoot_percents[shoot_percents$had_late_night, analyzeColumn], paired=TRUE))

# Count players.
length(unique(shoot_percents$player_name))
